library(tidyverse)
library(stringr)
library(randomForest) 
library(vip) 
library(rpart)
library(rpart.plot)
library(caret)
library(tidytext)

Research Data

The dataset contains 2515 unique videos and their subtitles from over 91 different YouTubers, spanning many different categories.

# Load the raw scraped YouTube data set.
df_raw <- read.csv("data.csv")
head(df_raw)

Research Goal

Do a sentiment analysis on the subtitles and find the best multiple linear regression model to predict the number of views using Subscribers, CC, Released, Category, Sentiment and Length.

Tasks

  1. Data Cleaning.
  2. Conduct a sentiment analysis on the subtitles.
  3. Try various statistical models like linear regression, decision tree and random forest.
  4. Compare these models in terms of prediction performance and interpretability.

Data Cleaning

# Preview the four string-formatted columns that need cleaning.
df_raw %>% select(Views, Subscribers, Released, Length) %>% head(10)

Looking at the data, we notice several problems in the data, like:

  1. Views: The Views variable is in string format and the units are different, like “10K views”, “10M views”. We prefer it to be numeric and in a consistent unit in order to conduct statistical analysis.

  2. Subscribers: The same problems as Views. The Subscribers variable is like “10K subscribers”, “10M subscribers”.

  3. Length: The video length is in string format, like “12:00”, “1:12:00”. We need it to be in number and in the same unit.

  4. Released: The Released variable is in string format, like “2 years ago”, “10 months ago”. We need it to be numeric and in the same unit.

Therefore, we need to do data cleaning first.

# Unify units and convert a views string to a number in thousands (K):
# "10K views" -> 10, "10M views" -> 10000, "500 views" -> 0.5.
cleanViews <- function(str) {
  str <- sub(" views", "", str, fixed = TRUE)
  n <- nchar(str)
  last <- substr(str, n, n)
  # Strip only the trailing unit character. The previous implementation
  # removed the FIRST occurrence of `last` anywhere in the string, which
  # corrupted unit-less counts (e.g. "99" -> "9").
  if (last == "M") {
    as.numeric(substr(str, 1, n - 1)) * 1000
  } else if (last == "K") {
    as.numeric(substr(str, 1, n - 1))
  } else {
    # No unit suffix: a raw count; convert to thousands for consistency.
    as.numeric(str) / 1000
  }
}

# Unify units and convert a subscribers string to a number in thousands (K):
# "10K subscribers" -> 10, "10M subscribers" -> 10000, "500 subscribers" -> 0.5.
cleanSubscribers <- function(str) {
  str <- sub(" subscribers", "", str, fixed = TRUE)
  n <- nchar(str)
  last <- substr(str, n, n)
  # Strip only the trailing unit character. The previous implementation
  # removed the FIRST occurrence of `last` anywhere in the string, which
  # corrupted unit-less counts (e.g. "99" -> "9").
  if (last == "M") {
    as.numeric(substr(str, 1, n - 1)) * 1000
  } else if (last == "K") {
    as.numeric(substr(str, 1, n - 1))
  } else {
    # No unit suffix: a raw count; convert to thousands for consistency.
    as.numeric(str) / 1000
  }
}

# Convert an "h:mm:ss" or "mm:ss" duration string to minutes, rounding up
# by one minute for the trailing seconds: "12:00" -> 13, "1:12:00" -> 73.
cleanLength <- function(str) {
  parts <- strsplit(str, ":", fixed = TRUE)[[1]]
  if (length(parts) == 3) {
    h <- as.numeric(parts[1])
    m <- as.numeric(parts[2])
    # Bug fix: the hours term was computed as 60*m (i.e. 61 minutes per
    # minute and hours ignored); it must be 60*h.
    (m + 1) + 60 * h
  } else {
    m <- as.numeric(parts[1])
    # +1 rounds the seconds component up to a whole minute, matching the
    # three-part branch above.
    m + 1
  }
}

# Convert a relative release-time string to months ago:
# "1 year ago" / "2 years ago" -> 12 / 24, "10 months ago" -> 10.
# A leading "Streamed " prefix (live streams) is dropped first.
cleanReleased <- function(str) {
  str <- sub("Streamed ", "", str, fixed = TRUE)
  parts <- strsplit(str, " ", fixed = TRUE)[[1]]
  # startsWith covers both "year" and "years"; the previous exact match on
  # "years" silently treated "1 year ago" as 1 month instead of 12.
  if (startsWith(parts[2], "year")) {
    as.numeric(parts[1]) * 12
  } else {
    # Assumed to be "month"/"months"; other units (weeks, days) would need
    # extra handling -- TODO confirm against the raw data.
    as.numeric(parts[1])
  }
}

# Drop rows with a missing or blank Released field, then convert the four
# string-formatted columns to numbers using the cleaning helpers above.
df <- df_raw %>%
  filter(!is.na(Released), Released != "") %>%
  mutate(
    Views = map_dbl(Views, cleanViews),
    Subscribers = map_dbl(Subscribers, cleanSubscribers),
    Length = map_dbl(Length, cleanLength),
    Released = map_dbl(Released, cleanReleased)
  )

# Confirm the cleaned columns are now numeric.
df %>% select(Views, Subscribers, Released, Length) %>% head(10)

# Save the cleaned data for future use.
write_csv(df, "cleaned_data.csv")

After cleaning, Views, Subscribers, Released and Length are numeric: Views and Subscribers are in thousands (K), Length is in minutes, and Released is in months.

Data Discovery

# Reload the cleaned data; readr reports the parsed column types below.
df <- read_csv("cleaned_data.csv")
## Rows: 2098 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Id, Channel, Title, URL, Category, Transcript
## dbl (5): Subscribers, CC, Released, Views, Length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# CC (closed-captions flag, 0/1) and Category are categorical predictors.
df$CC <- as.factor(df$CC)
df$Category <- as.factor(df$Category)
# NOTE(review): Subscribers is already parsed as dbl above, so this
# coercion looks redundant -- confirm.
df$Subscribers <- as.numeric(df$Subscribers)

head(df)
# Class balance of the CC flag.
table(df$CC)
## 
##    0    1 
## 1094 1004
# Compare log-views between videos with and without closed captions.
boxplot(log(Views) ~ CC, data = df)

# Marginal summaries of the numeric predictors.
summary(df$Length)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    6.00   11.00   26.94   17.00 3539.00
summary(df$Released)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   24.00   36.00   46.46   60.00  156.00
summary(df$Subscribers)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     179    1730    4010    6871    8240   89700
# Pairwise relationships between the response and the predictors.
pairs(Views ~ CC + Released + Length + Subscribers + Category, data=df)

Sentiment Analysis / Text mining

TODO by Xinyi

# Keep only the text fields needed for text mining.
df_script <- df %>% 
  select(Title, Transcript)
head(df_script)
# Standard tidytext stop-word lexicon, plus a custom entry so "_" tokens
# left over in the transcripts are also dropped.
data("stop_words")
custom_stop_words <- rbind(stop_words, c("_", "custom"))
#bigram
# NOTE(review): this bigram tokenization is neither assigned nor
# summarised, so its result is discarded -- confirm whether it should
# feed into the analysis or be removed.
df_script %>% 
  group_by(Title) %>% 
  unnest_tokens(word, Transcript, token = "ngrams", n = 2) 
# Tokenize each transcript into words, drop stop words, then count word
# frequencies; `total` is the word count within each group.
df_script<- df_script %>% 
  group_by(Title) %>% 
  unnest_tokens(word, Transcript) %>% 
  anti_join(custom_stop_words) %>% 
  count(word, sort = TRUE) %>% 
  mutate(total = sum(n)) %>% 
  ungroup()
## Joining, by = "word"
  #inner_join(get_sentiments("afinn")) #%>% 
  #summarise(sentiment = sum(value))
  #mutate(total = sum(word)) %>% 
  #mutate(perc = round(n/total, 2))

head(df_script)
# Join AFINN sentiment scores and compute a per-video score.
# NOTE(review): this result is not assigned back, so the sentiment score
# is never attached to `df` for the regression models -- confirm intent.
df_script %>% 
  group_by(Title) %>% 
  inner_join(get_sentiments("afinn")) %>% 
  mutate(afinn_score = sum(value)) %>% 
  mutate(perc = round(n/total, 2))
## Joining, by = "word"
## Joining, by = "word"

Preparation for Cross-Validation

Randomly split the data set in a 70% training and 30% test set. Make sure to use set.seed() so that your results are reproducible

# Reproducible 70%/30% train-test split of the cleaned data.
set.seed(652)
n <- nrow(df)
train_index <- sample(seq_len(n), size = round(0.7 * n))
df_train <- df[train_index, ]
df_test <- df[-train_index, ]

Linear Regression

# Baseline multiple linear regression.
# NOTE(review): this model is fit on the full `df`, while the tree and
# random forest below are fit on `df_train`; for a fair out-of-sample
# comparison it should probably use `df_train` -- confirm intent.
# `Category` is also absent here despite the research goal listing it.
lm1 <- lm(Views ~ CC + Released + Length + Subscribers, data=df)
summary(lm1)
## 
## Call:
## lm(formula = Views ~ CC + Released + Length + Subscribers, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27428  -4097  -1619   1845 299580 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  996.87365  540.88021   1.843  0.06546 .  
## CC1         1634.75349  533.44242   3.065  0.00221 ** 
## Released      20.86454    9.12050   2.288  0.02226 *  
## Length        -2.23572    1.44601  -1.546  0.12222    
## Subscribers    1.37690    0.02726  50.517  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11950 on 2093 degrees of freedom
## Multiple R-squared:  0.561,  Adjusted R-squared:  0.5602 
## F-statistic: 668.7 on 4 and 2093 DF,  p-value: < 2.2e-16

Regression Tree

Fit a regression tree on the training set.

# Fit tree model (method = "anova" gives a regression tree for the
# numeric response Views)
t1 <- rpart(Views ~ CC + Released + Category + Length + Subscribers,
            data = df_train,
            method = "anova")

# Plot the decision tree
rpart.plot(t1)

# Plot R-square vs Splits and the Relative Error vs Splits.
rsq.rpart(t1)
## 
## Regression tree:
## rpart(formula = Views ~ CC + Released + Category + Length + Subscribers, 
##     data = df_train, method = "anova")
## 
## Variables actually used in tree construction:
## [1] Category    Length      Subscribers
## 
## Root node error: 5.1207e+11/1469 = 348582113
## 
## n= 1469 
## 
##         CP nsplit rel error  xerror    xstd
## 1 0.310493      0   1.00000 1.00150 0.24648
## 2 0.110528      1   0.68951 0.73970 0.19123
## 3 0.067016      2   0.57898 0.65220 0.19748
## 4 0.044738      3   0.51196 0.60975 0.19366
## 5 0.031356      4   0.46722 0.59609 0.19321
## 6 0.017075      5   0.43587 0.55910 0.19210
## 7 0.016164      6   0.41879 0.56347 0.19921
## 8 0.010000      7   0.40263 0.54809 0.19903

Make predictions on the test set and compute the RMSE

# Make prediction on the held-out test set
pred_tree <- predict(t1, newdata = df_test)

# Compute the RMSE (caret::RMSE)
RMSE(df_test$Views, pred_tree)
## [1] 7977.73

Random Forest

Fit a Random Forest on the training set using the defaults for mtry and ntree.

# Re-seed so the bootstrap samples in the forest are reproducible.
set.seed(652)
# Random forest with default mtry and ntree; importance = TRUE stores
# variable-importance measures for the vip() plot below.
rf1 <- randomForest(Views ~ CC + Released + Category + Length + Subscribers, importance = TRUE, data = df_train)
rf1
## 
## Call:
##  randomForest(formula = Views ~ CC + Released + Category + Length +      Subscribers, data = df_train, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 175526230
##                     % Var explained: 49.65

Use the vip() function to make a variable importance plot. Which variables are most important?

# Variable-importance plot for the random forest (all features shown).
vip(rf1, num_features = 14,  include_type = TRUE)

Make predictions on the test set and compute the RMSE

# Make prediction on the same held-out test set as the tree
pred_rf <- predict(rf1, newdata = df_test)

# Compute the RMSE (caret::RMSE)
RMSE(df_test$Views, pred_rf)
## [1] 7645.176

Conclusion(TODO)

Model RMSE R Squared Number of Coefficients performance interpretability
linear regression - - - - -
regression tree - - - - -
random forest - - - - -

Regression Tree VS Random Forest

Aggregated/ensemble models are not universally better than their “single” counterparts; they are better if and only if the single models suffer from instability. With 1,469 training rows and only five predictor columns, we are in a comfortable training sample size situation in which even a single decision tree may be reasonably stable.